# Column names for the automobile price data set (the file has no header row).
data.names <- c(
  "symboling", "normalized-losses", "wheel-base", "length",
  "width", "height", "curb-weight", "engine-size", "bore",
  "stroke", "compression-ratio", "horsepower", "peak-rpm",
  "city-mpg", "highway-mpg", "price"
)
# Read the comma-separated data and attach the names above.
data <- read.csv(file = 'price.data', sep = ",", header = FALSE,
                 col.names = data.names)
# Show the first six observations, transposed for readability.
pander(t(head(data)))
| 1 | 2 | 3 | 4 | 5 | 6 | |
|---|---|---|---|---|---|---|
| symboling | 2 | 2 | 1 | 1 | 2 | 0 |
| normalized.losses | 164 | 164 | 158 | 158 | 192 | 192 |
| wheel.base | 99.8 | 99.4 | 105.8 | 105.8 | 101.2 | 101.2 |
| length | 176.6 | 176.6 | 192.7 | 192.7 | 176.8 | 176.8 |
| width | 66.2 | 66.4 | 71.4 | 71.4 | 64.8 | 64.8 |
| height | 54.3 | 54.3 | 55.7 | 55.9 | 54.3 | 54.3 |
| curb.weight | 2337 | 2824 | 2844 | 3086 | 2395 | 2395 |
| engine.size | 109 | 136 | 136 | 131 | 108 | 108 |
| bore | 3.19 | 3.19 | 3.19 | 3.13 | 3.5 | 3.5 |
| stroke | 3.4 | 3.4 | 3.4 | 3.4 | 2.8 | 2.8 |
| compression.ratio | 10 | 8 | 8.5 | 8.3 | 8.8 | 8.8 |
| horsepower | 102 | 115 | 110 | 140 | 101 | 101 |
| peak.rpm | 5500 | 5500 | 5500 | 5500 | 5800 | 5800 |
| city.mpg | 24 | 18 | 19 | 17 | 23 | 23 |
| highway.mpg | 30 | 22 | 25 | 20 | 29 | 29 |
| price | 13950 | 17450 | 17710 | 23875 | 16430 | 16925 |
Cars are initially assigned a risk factor symbol associated with its price.
Then, if it is more risky (or less), this symbol is adjusted by moving it up (or down) the scale. Actuaries call this process symboling. A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.
The normalized loss (normalized.losses) is also considered. This value is normalized for all autos within a particular size classification (two-door small, station wagons, sports/speciality, etc.), and represents the average loss per car per year.
print(c("ggplot2", "MASS", "TSA", "stats", "leaps", "reshape2", "broom", "glmnet", "caret"))
## [1] "ggplot2" "MASS" "TSA" "stats" "leaps" "reshape2"
## [7] "broom" "glmnet" "caret"
159 instances; 14 continuous variables; 1 nominal variable (symboling);
The \(y\) is “price”.
# Pairwise scatterplots of every variable.
car::scatterplotMatrix(data)
# Correlation heat map: melt the correlation matrix into long form for ggplot.
data.cor <- cor(data)
data.cor.melt <- melt(data.cor)
ggplot(data = data.cor.melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile()
# Boxplots of all variables on a common (standardised) scale.
ggplot(stack(as.data.frame(scale(data))), aes(x = ind, y = values)) +
  geom_boxplot()
# Price distribution within each symboling (risk) category.
ggplot(data, aes(x = factor(symboling), y = price)) +
  geom_boxplot()
# Separate the response from the predictors and fit the full linear model.
price <- data$price
data <- data[, -16]
model <- lm(price ~ ., data = data)
pander(model_assesment(model, price))
| R2 | R2_Adj | AIC | BIC | PRESS | R2_Pred | RMSE |
|---|---|---|---|---|---|---|
| 0.8508 | 0.8351 | 2942 | 2994 | 1.145e+09 | 0.7903 | 2264 |
# Design matrix with an explicit intercept column (nrow(data) rather than a
# hard-coded 159, so the code survives a change in sample size).
XX <- cbind(rep(1, nrow(data)), as.matrix(data))
YY <- as.matrix(price)
# Maximum-likelihood / least-squares estimate of the coefficients.
# Solving the normal equations with solve(crossprod(XX), crossprod(XX, YY))
# avoids forming the explicit inverse solve(t(XX) %*% XX) %*% t(XX) %*% YY,
# which is both slower and numerically less stable.
MLE <- solve(crossprod(XX), crossprod(XX, YY))
# MLE and lm()'s LSE coincide under the normal-errors model, as the table shows.
pander(matrix(data = c(MLE, model$coefficients),
              nrow = 16,
              dimnames = list(c("Intercept", row.names(MLE)[2:16]),
                              c("MLE", "LSE"))))
| MLE | LSE | |
|---|---|---|
| Intercept | -59421 | -59421 |
| symboling | 79.76 | 79.76 |
| normalized.losses | 7.139 | 7.139 |
| wheel.base | 197.9 | 197.9 |
| length | -92.54 | -92.54 |
| width | 767.1 | 767.1 |
| height | 38.89 | 38.89 |
| curb.weight | 5.093 | 5.093 |
| engine.size | 49.91 | 49.91 |
| bore | -1814 | -1814 |
| stroke | -1839 | -1839 |
| compression.ratio | 104.1 | 104.1 |
| horsepower | 26.11 | 26.11 |
| peak.rpm | 0.7534 | 0.7534 |
| city.mpg | 18.95 | 18.95 |
| highway.mpg | -13.46 | -13.46 |
LB.test(model)
##
## Box-Ljung test
##
## data: residuals from model
## X-squared = 116.64, df = 12, p-value < 2.2e-16
shapiro.test(model$res)
##
## Shapiro-Wilk normality test
##
## data: model$res
## W = 0.96248, p-value = 0.000265
par(mfrow = c(1, 1))
# Box-Cox transformation of the response to correct the non-normal residuals.
b <- boxcox(price ~ ., data = data)
# Lambda maximising the profile log-likelihood. which.max() replaces the
# original b$x[which(b$y == max(b$y))]: exact == on doubles is fragile and
# would return a vector (breaking the transform below) if the maximum tied.
lambda <- b$x[which.max(b$y)]
# Standard Box-Cox transform of the response.
price.bc <- (price^lambda - 1) / lambda
model.bc <- lm(price.bc ~ ., data = data)
pander(model_assesment(model.bc, price.bc))
| R2 | R2_Adj | AIC | BIC | PRESS | R2_Pred | RMSE |
|---|---|---|---|---|---|---|
| 0.8902 | 0.8787 | -1507 | -1454 | 0.000755 | 0.8562 | 0.001904 |
# Drop influential observations (Cook's distance above 0.05) and refit.
# Compute the index set once instead of re-deriving it on every use.
influential <- which(cooks.distance(model.bc) > 0.05)
price.bc.rm <- price.bc[-influential]
model.bc.rm <- lm(price.bc.rm ~ .,
                  data = data[-influential, ])
pander(model_assesment(model.bc.rm, price.bc.rm))
| R2 | R2_Adj | AIC | BIC | PRESS | R2_Pred | RMSE |
|---|---|---|---|---|---|---|
| 0.9096 | 0.8997 | -1490 | -1439 | 0.0005331 | 0.8858 | 0.001661 |
pander(car::vif(model.bc.rm))
| symboling | normalized.losses | wheel.base | length | width | height |
|---|---|---|---|---|---|
| 2.553 | 2.192 | 8.811 | 8.388 | 7.185 | 2.708 |
| curb.weight | engine.size | bore | stroke | compression.ratio | horsepower |
|---|---|---|---|---|---|
| 16.85 | 9.319 | 2.588 | 1.536 | 2.948 | 7.632 |
| peak.rpm | city.mpg | highway.mpg |
|---|---|---|
| 2.107 | 26.18 | 21.91 |
# Forward selection must start from the intercept-only model with the full
# model as the upper scope: calling step() on the already-full model with
# direction = "forward" is a no-op (there is nothing left to add), which is
# why the original run retained all 15 predictors.
model.null <- lm(price.bc.rm ~ 1,
                 data = data[-which(cooks.distance(model.bc) > 0.05), ])
model.for <- step(model.null,
                  scope = formula(model.bc.rm),
                  direction = "forward")
pander(names(model.for$coefficients))
(Intercept), symboling, normalized.losses, wheel.base, length, width, height, curb.weight, engine.size, bore, stroke, compression.ratio, horsepower, peak.rpm, city.mpg and highway.mpg
pander(model_assesment(model.for, price.bc.rm))
| R2 | R2_Adj | AIC | BIC | PRESS | R2_Pred | RMSE |
|---|---|---|---|---|---|---|
| 0.9096 | 0.8997 | -1490 | -1439 | 0.0005331 | 0.8858 | 0.001661 |
# Backward elimination: drop predictors from the full Box-Cox model until
# the AIC stops improving.
model.bac <- step(object = model.bc.rm, direction = "backward")
selected.terms <- names(model.bac$coefficients)
pander(selected.terms)
(Intercept), normalized.losses, wheel.base, length, curb.weight, engine.size, compression.ratio, horsepower and city.mpg
pander(model_assesment(model.bac, price.bc.rm))
| R2 | R2_Adj | AIC | BIC | PRESS | R2_Pred | RMSE |
|---|---|---|---|---|---|---|
| 0.9068 | 0.9017 | -1500 | -1469 | 0.0004959 | 0.8938 | 0.001686 |
# Stepwise selection: allow both additions and deletions at each step.
model.stepw <- step(object = model.bc.rm, direction = "both")
kept.terms <- names(model.stepw$coefficients)
pander(kept.terms)
(Intercept), normalized.losses, wheel.base, length, curb.weight, engine.size, compression.ratio, horsepower and city.mpg
pander(model_assesment(model.stepw, price.bc.rm))
| R2 | R2_Adj | AIC | BIC | PRESS | R2_Pred | RMSE |
|---|---|---|---|---|---|---|
| 0.9068 | 0.9017 | -1500 | -1469 | 0.0004959 | 0.8938 | 0.001686 |
library(leaps)
# All-subsets regression on the outlier-free data; compute the index of the
# rows to keep once rather than re-deriving it in every call.
clean.rows <- -which(cooks.distance(model.bc) > 0.05)
leaps(data[clean.rows, ], price.bc.rm)
ll <- regsubsets(data[clean.rows, ], price.bc.rm, nbest = 15)
# Four model-selection criteria, plotted two per page.
par(mfrow = c(1, 2))
plot(ll, scale = 'adjr2')
plot(ll, scale = 'bic')
plot(ll, scale = 'r2')
plot(ll, scale = 'Cp')
# Principal components of the cleaned predictor matrix (covariance PCA:
# princomp() does not rescale, so high-variance columns dominate).
pca_x <- princomp(data[clean.rows, ])
par(mfrow = c(1, 1))
# The original main = deparse(substitute("Scree Plot")) merely produced the
# quoted string "\"Scree Plot\""; pass the title directly.
screeplot(pca_x, type = "lines", main = "Scree Plot")
biplot(pca_x)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
summary(pca_x)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 524.9956893 401.0934664 32.92800503 1.812614e+01
## Proportion of Variance 0.6291576 0.3672308 0.00247502 7.499955e-04
## Cumulative Proportion 0.6291576 0.9963884 0.99886339 9.996134e-01
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 1.024533e+01 5.639159e+00 4.329375e+00 2.271777e+00
## Proportion of Variance 2.396071e-04 7.258997e-05 4.278567e-05 1.178093e-05
## Cumulative Proportion 9.998530e-01 9.999256e-01 9.999684e-01 9.999802e-01
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 2.152711e+00 1.443324e+00 9.402942e-01 7.659430e-01
## Proportion of Variance 1.057839e-05 4.755273e-06 2.018252e-06 1.339186e-06
## Cumulative Proportion 9.999907e-01 9.999955e-01 9.999975e-01 9.999988e-01
## Comp.13 Comp.14 Comp.15
## Standard deviation 6.475052e-01 2.482170e-01 1.591906e-01
## Proportion of Variance 9.570496e-07 1.406407e-07 5.784723e-08
## Cumulative Proportion 9.999998e-01 9.999999e-01 1.000000e+00
loadings(pca_x)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## symboling
## normalized.losses 0.986 -0.156
## wheel.base -0.281 -0.301
## length -0.889 0.305 0.143
## width -0.134
## height
## curb.weight -0.746 -0.661
## engine.size 0.102 0.502 -0.844 -0.119
## bore
## stroke
## compression.ratio -0.104 0.123 0.500 -0.777
## horsepower 0.115 0.833 0.482 0.212
## peak.rpm 0.664 -0.747
## city.mpg -0.129 0.213 0.521 0.227
## highway.mpg -0.125 0.176 0.570 0.463
## Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
## symboling 0.185 0.395 0.621 0.645
## normalized.losses
## wheel.base -0.864 0.127 0.229
## length 0.271
## width -0.103 -0.165 0.179 0.656 -0.688
## height 0.977 -0.152
## curb.weight
## engine.size
## bore 0.295 0.954
## stroke -0.954 0.295
## compression.ratio 0.302 -0.128
## horsepower
## peak.rpm
## city.mpg -0.166 -0.674 0.329 0.126
## highway.mpg 0.574 -0.225 -0.133
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.067 0.067 0.067 0.067 0.067 0.067 0.067 0.067
## Cumulative Var 0.067 0.133 0.200 0.267 0.333 0.400 0.467 0.533
## Comp.9 Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.067 0.067 0.067 0.067 0.067 0.067 0.067
## Cumulative Var 0.600 0.667 0.733 0.800 0.867 0.933 1.000
# Principal-component regression. predict() on the training data reproduces
# pca_x$scores, so use it consistently instead of leaving pca_data as a dead
# assignment (the original built reg.data from pca_x$scores and never used
# pca_data).
pca_data <- predict(pca_x)
# Regress on the first three components, which carry > 99.8% of the variance
# (see the PCA summary above).
reg.data <- data.frame(pca_data[, 1:3], price.bc.rm)
model_pca <- lm(price.bc.rm ~ ., data = reg.data)
pander(model_assesment(model_pca, price.bc.rm))
| R2 | R2_Adj | AIC | BIC | PRESS | R2_Pred | RMSE |
|---|---|---|---|---|---|---|
| 0.8587 | 0.8558 | -1446 | -1431 | 0.0006981 | 0.8505 | 0.002076 |
summary(model_pca)
##
## Call:
## lm(formula = price.bc.rm ~ ., data = reg.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0079376 -0.0009421 -0.0001152 0.0013305 0.0061123
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.122e+00 1.701e-04 12475.033 <2e-16 ***
## Comp.1 -7.272e-06 3.240e-07 -22.444 <2e-16 ***
## Comp.2 -8.484e-06 4.241e-07 -20.006 <2e-16 ***
## Comp.3 6.243e-06 5.166e-06 1.209 0.229
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.002104 on 149 degrees of freedom
## Multiple R-squared: 0.8587, Adjusted R-squared: 0.8558
## F-statistic: 301.8 on 3 and 149 DF, p-value: < 2.2e-16
pander(assesment)
| R2 | R2_Adj | AIC | BIC | PRESS | R2_Pred | RMSE | |
|---|---|---|---|---|---|---|---|
| model | 0.8508 | 0.8351 | 2942 | 2994 | 1.145e+09 | 0.7903 | 2264 |
| model.bc | 0.8902 | 0.8787 | -1507 | -1454 | 0.000755 | 0.8562 | 0.001904 |
| model.bc.rm | 0.9096 | 0.8997 | -1490 | -1439 | 0.0005331 | 0.8858 | 0.001661 |
| model.pca | 0.8587 | 0.8558 | -1446 | -1431 | 0.0006981 | 0.8505 | 0.002076 |
| model.bac | 0.9068 | 0.9017 | -1500 | -1469 | 0.0004959 | 0.8938 | 0.001686 |
| model.for | 0.9096 | 0.8997 | -1490 | -1439 | 0.0005331 | 0.8858 | 0.001661 |
| model.stepw | 0.9068 | 0.9017 | -1500 | -1469 | 0.0004959 | 0.8938 | 0.001686 |
# Ridge regression (alpha = 0) with the penalty chosen by cross-validation.
# Build the outlier-free design matrix once instead of recomputing
# cooks.distance() for both the fit and the prediction.
# NOTE(review): cv.glmnet's folds are random, so results vary between runs
# unless set.seed() is called beforehand.
X.rid <- as.matrix(data[-which(cooks.distance(model.bc) > 0.05), ])
cv_fit <- cv.glmnet(X.rid, price.bc.rm, alpha = 0)
plot(cv_fit)
# Lambda minimising the cross-validated error.
opt_lambda <- cv_fit$lambda.min
y_predicted <- predict(cv_fit, s = opt_lambda, newx = X.rid)
# In-sample R2, RMSE and MAE for the ridge fit.
SST_rid <- sum((price.bc.rm - mean(price.bc.rm))^2)
SSE_rid <- sum((y_predicted - price.bc.rm)^2)
rid_assessment <- list(R2_rid = 1 - SSE_rid / SST_rid,
                       RMSE_rid = caret::RMSE(y_predicted, price.bc.rm),
                       MAE_rid = caret::MAE(y_predicted, price.bc.rm))
pander(rid_assessment)
# LASSO (alpha = 1) with the penalty chosen by cross-validation; same
# structure as the ridge fit. The outlier-free design matrix is computed
# once rather than re-deriving cooks.distance() twice.
X.las <- as.matrix(data[-which(cooks.distance(model.bc) > 0.05), ])
cv_fit1 <- cv.glmnet(X.las, price.bc.rm, alpha = 1)
plot(cv_fit1)
# Lambda minimising the cross-validated error.
opt_lambda1 <- cv_fit1$lambda.min
y_predicted1 <- predict(cv_fit1, s = opt_lambda1, newx = X.las)
# In-sample R2, RMSE and MAE for the LASSO fit.
SST_las <- sum((price.bc.rm - mean(price.bc.rm))^2)
SSE_las <- sum((y_predicted1 - price.bc.rm)^2)
las_assessment <- list(R2_las = 1 - SSE_las / SST_las,
                       RMSE_las = caret::RMSE(y_predicted1, price.bc.rm),
                       MAE_las = caret::MAE(y_predicted1, price.bc.rm))
pander(las_assessment)
Cross-validation refers to a set of methods for measuring the performance of a given predictive model on new test data sets.
The basic idea, behind cross-validation techniques, consists of dividing the data into two sets:
The training set, used to train (i.e. build) the model;
and the testing set (or validation set), used to test (i.e. validate) the model by estimating the prediction error.
Cross-validation is also known as a resampling method because it involves fitting the same statistical method multiple times using different subsets of the data.
We described several statistical metrics for quantifying the overall quality of regression models. These include:
R-squared (R2), representing the squared correlation between the observed outcome values and the values predicted by the model. The higher the R2, the better the model.
Root Mean Squared Error (RMSE), which measures the average prediction error made by the model in predicting the outcome for an observation. That is, the average difference between the observed known outcome values and the values predicted by the model. The lower the RMSE, the better the model.
Mean Absolute Error (MAE), an alternative to the RMSE that is less sensitive to outliers. It corresponds to the average absolute difference between observed and predicted outcomes. The lower the MAE, the better the model.
R2, RMSE and MAE are used to measure the regression model performance during cross-validation.
validation(model_pca, reg.data[,-4], reg.data[,4])
## R2 R2.adj RMSE MAE
## 1 0.8615524 0.8581198 0.002104196 0.001582264
validation(model.bc.rm, data[-which(cooks.distance(model.bc) > 0.05),], price.bc.rm)
## R2 R2.adj RMSE MAE
## 1 0.8897861 0.874619 0.001882494 0.001516456
validation(model.stepw, data[-which(cooks.distance(model.bc) > 0.05),], price.bc.rm)
## R2 R2.adj RMSE MAE
## 1 0.8989515 0.8919827 0.001803714 0.001447884
# Train/test validation of the ridge fit (alpha = 0 selects ridge in the
# project helper validation_rid_or_las).
validation_rid_or_las(cv_fit, data[-which(cooks.distance(model.bc) > 0.05),
],price.bc.rm, alpha=0)
## R2 RMSE MAE
## 1 0.8927893 0.001846552 0.001457047
# Train/test validation of the LASSO fit (alpha = 1 selects LASSO).
validation_rid_or_las(cv_fit1, data[-which(cooks.distance(model.bc) > 0.05),
],price.bc.rm, alpha=1)
## R2 RMSE MAE
## 1 0.8932267 0.001849885 0.001465516
This method works as follows:
# LOOCV of the full untransformed model (leave_one_out is a caret-based
# project helper defined elsewhere).
leave_one_out(data, price)
## Linear Regression
##
## 159 samples
## 15 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 158, 158, 158, 158, 158, 158, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 2683.154 0.7908478 1894.326
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# LOOCV of the Box-Cox model without influential points.
leave_one_out(data[-which(cooks.distance(model.bc) > 0.05),], price.bc.rm)
## Linear Regression
##
## 153 samples
## 15 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 152, 152, 152, 152, 152, 152, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.001866648 0.8860003 0.001497278
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# LOOCV of the principal-component regression (first 3 components).
leave_one_out(reg.data[,-4], reg.data[,4])
## Linear Regression
##
## 153 samples
## 3 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 152, 152, 152, 152, 152, 152, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.002136 0.8505018 0.00158211
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# LOOCV of the stepwise model, restricted to its 8 selected predictors.
leave_one_out(data[-which(cooks.distance(model.bc) > 0.05), names(model.stepw$coefficients)[2:9]], price.bc.rm)
## Linear Regression
##
## 153 samples
## 8 predictor
##
## No pre-processing
## Resampling: Leave-One-Out Cross-Validation
## Summary of sample sizes: 152, 152, 152, 152, 152, 152, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.001800251 0.8938775 0.001435684
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# 10-fold CV of the full untransformed model (project helper).
k_fold_cross_validation(data, price)
## Linear Regression
##
## 159 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 143, 143, 143, 143, 143, 143, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 2589.227 0.8100677 1883.753
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# 10-fold CV of the Box-Cox model without influential points.
k_fold_cross_validation(data[-which(cooks.distance(model.bc) > 0.05),], price.bc.rm)
## Linear Regression
##
## 153 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 137, 138, 138, 137, 137, 138, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.001848829 0.8952209 0.001511077
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# 10-fold CV of the principal-component regression.
k_fold_cross_validation(reg.data[,-4], reg.data[,4])
## Linear Regression
##
## 153 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 138, 137, 138, 137, 138, 138, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.002100624 0.8708737 0.001585104
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# 10-fold CV of the stepwise model on its 8 selected predictors.
k_fold_cross_validation(data[-which(cooks.distance(model.bc) > 0.05), names(model.stepw$coefficients)[2:9]], price.bc.rm)
## Linear Regression
##
## 153 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 137, 139, 137, 138, 138, 137, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.001749473 0.9035505 0.00142258
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# 10-fold CV of ridge regression (project helper; tunes lambda internally).
K_fold_cross_ridge(data[-which(cooks.distance(model.bc) > 0.05),], price.bc.rm )
## Ridge Regression
##
## 153 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 137, 137, 137, 138, 137, 139, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0e+00 0.001846699 0.8936462 0.001508353
## 1e-04 0.001846348 0.8936936 0.001507916
## 1e-01 0.001868511 0.9008948 0.001472374
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 1e-04.
# 10-fold CV of the LASSO (project helper; tunes the shrinkage fraction).
K_fold_cross_lasso (data[-which(cooks.distance(model.bc) > 0.05),], price.bc.rm)
## The lasso
##
## 153 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 137, 137, 137, 138, 139, 138, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.1 0.004603955 0.8445625 0.003878651
## 0.5 0.001963056 0.8867621 0.001536946
## 0.9 0.001820111 0.8939786 0.001492225
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
The process of splitting the data into k-folds can be repeated a number of times, this is called repeated k-fold cross validation.
The final model error is taken as the mean error from the number of repeats.
# model
repeated_K_fold_cross_validation(data, price)
## Linear Regression
##
## 159 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 143, 143, 144, 143, 143, 143, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 2630.184 0.8246088 1891.454
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Repeated 10-fold CV of the Box-Cox model without influential points.
repeated_K_fold_cross_validation(data[-which(cooks.distance(model.bc) > 0.05),], price.bc.rm)
## Linear Regression
##
## 153 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 138, 138, 137, 137, 138, 137, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.001893724 0.8931495 0.001557475
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Repeated 10-fold CV of the principal-component regression.
repeated_K_fold_cross_validation(reg.data[,-4], reg.data[,4])
## Linear Regression
##
## 153 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 138, 137, 139, 137, 138, 137, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.002094413 0.8656624 0.001574099
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Repeated 10-fold CV of the stepwise model on its 8 selected predictors.
repeated_K_fold_cross_validation(data[-which(cooks.distance(model.bc) > 0.05), names(model.stepw$coefficients)[2:9]], price.bc.rm)
## Linear Regression
##
## 153 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 139, 140, 138, 137, 137, 137, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.001797577 0.9000276 0.001438249
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Repeated 10-fold CV of ridge regression (project helper).
repeated_k_fold_ridge(data[-which(cooks.distance(model.bc) > 0.05),], price.bc.rm )
## Ridge Regression
##
## 153 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 137, 139, 138, 137, 138, 137, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0e+00 0.001796333 0.8954076 0.001473279
## 1e-04 0.001795938 0.8954519 0.001472749
## 1e-01 0.001804583 0.9032393 0.001441200
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 1e-04.
# Repeated 10-fold CV of the LASSO (project helper).
repeated_k_fold_lasso (data[-which(cooks.distance(model.bc) > 0.05),], price.bc.rm)
## The lasso
##
## 153 samples
## 15 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 138, 138, 139, 137, 137, 137, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.1 0.004612633 0.8511393 0.003875674
## 0.5 0.001992502 0.8857051 0.001537437
## 0.9 0.001885925 0.8960765 0.001496187
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
[1]Box-Cox Transformations: An Overview;Pengfei Li;Apr 2005
[2]Building a Robust Linear Model with Forward Selection and Stepwise Procedures;Jafar A. Khan, Stefan Van Aelst, Ruben H. Zamar;January 2007
[3]Best Subset Selection via a Modern Optimization Lens ; Dimitris Bertsimas,Angela King, Rahul Mazumder; June 2014
[4]Overview:Principal component analysis;Herve Abdi 1, Lynne J. Williams;August 2010
[5]A survey of cross-validation procedures for model selection;Sylvain Arlot;July 2009
[6]Regression shrinkage and selection via the lasso;ROBERT TIBSHIRANI;January 1994
[7]Ridge Regression in Practice;DONALD W. MARQUARDT;Aug 2013
[8]Regularization and variable selection via the elastic net;Hui Zou, Trevor Hastie;September 2004
[9]A tutorial on support vector regression;ALEX J. SMOLA; November 2003
Comments: the result is not good.